Installing necessary packages to extract HTML elements from the website

#install.packages("rvest") #allows us to parse HTML content and extract the HTML elements from it. 
#install.packages("xml2")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
library(xml2) 
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(ggplot2)

EDA: Data Wrangling for Reviews

# 
# # Data Wrangling for every X1 attribute; since the data is in vertical format it is necessary to make it in record format for further analysis
# reviews_final_data <- data.frame(matrix(ncol = 13, nrow = 3000), stringsAsFactors = FALSE)
# colnames(reviews_final_data) <- c('Aircraft', 'Type Of Traveller', 'Seat Type', 'Route','Date Flown', 'Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Inflight Entertainment', 'Ground Service', 'Wifi & Connectivity', 'Value For Money', 'Recommended')
# ir = 1
# ic = 1
# for(i in 1:nrow(provisional_data)){
#    if(ic >= 14){
#      ir = ir+1
#      ic <- 1
#    }
#   if(ic==1){
#     if(grepl("Aircraft", provisional_data$X1[i] == TRUE)){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     } else{
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 2){
#     if(grepl("Type Of Traveller", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 3){
#     if(grepl("Seat Type", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     } else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 4){
#     if(grepl("Route", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   }
#   if(ic == 5){
#     if(grepl("Date Flown", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 6){
#     if(grepl("Seat Comfort", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 7){
#     if(grepl("Cabin Staff Service", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 8){
#     if(grepl("Food & Beverages", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 9){
#     if(grepl("Inflight Entertainment", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 10){
#     if(grepl("Ground Service", provisional_data$X1[i]) == TRUE){
#       reviews_final_data[ir,ic] = provisional_data$X2[i]
#       i = i + 1
#     }else {
#       reviews_final_data[ir,ic] = "9999"
#     }
#     ic = ic + 1
#   } 
#   if(ic == 11){
#     if(grepl("Wifi & Connectivity", provisional_data$X1[i]) == TRUE){
# #       reviews_final_data[ir,ic] = provisional_data$X2[i]
# #       i = i + 1
# #     }else {
# #       reviews_final_data[ir,ic] = "9999"
# #     }
# #     ic = ic + 1
# #   } 
# #   if(ic == 12){
# #     if(grepl("Value For Money", provisional_data$X1[i]) == TRUE){
# #       reviews_final_data[ir,ic] = provisional_data$X2[i]
# #       i = i + 1
# #     }else {
# #       reviews_final_data[ir,ic] = "9999"
# #     }
# #     ic = ic + 1
# #   } 
# #   if (ic == 13){
# #     if(grepl("Recommended", provisional_data$X1[i]) == TRUE){
# #       reviews_final_data[ir,ic] = provisional_data$X2[i]
# #       i = i + 1
# #     }else {
# #       reviews_final_data[ir,ic] = "9999"
# #     }
# #     ic = ic + 1
# #   }
# # 
# # }
# 
# # Define the column order based on column names
# column_order <- c("Aircraft", "Type Of Traveller", "Seat Type", "Route",
#                   "Date Flown", "Seat Comfort", "Cabin Staff Service",
#                   "Food & Beverages", "Inflight Entertainment", "Ground Service",
#                   "Wifi & Connectivity", "Value For Money", "Recommended")
# 
# # Initialize the output data frame with 9999 values
# reviews_final_data <- data.frame(matrix(rep("9999", length(column_order) * nrow(provisional_data)),
#                                         nrow = nrow(provisional_data),
#                                         ncol = length(column_order),
#                                         dimnames = list(NULL, column_order)))
# 
# # Loop through the input data and fill in the output data frame
# ic <- 1
# ir <- 1
# for (i in 1:nrow(provisional_data)) {
#   if (ic > length(column_order)) {
#     ir <- ir + 1
#     ic <- 1
#   }
#   if (grepl(column_order[ic], provisional_data$X1[i])) {
#     reviews_final_data[ir, ic] <- provisional_data$X2[i]
#     ic <- ic + 1
#   }
# }


# Reshape provisional_data (long key/value pairs: X1 = field label, X2 = value)
# into one row per review. Each review contributes up to 13 consecutive fields,
# so `ic` counts fields within the current review and `ir` advances to the next
# output row after every 13th field.
# NOTE(review): despite the original comment, cells start as NA, not "9999".
n_reviews <- 2159
field_labels <- c("Aircraft", "Type Of Traveller", "Seat Type", "Route",
                  "Date Flown", "Seat Comfort", "Cabin Staff Service",
                  "Food & Beverages", "Inflight Entertainment",
                  "Ground Service", "Wifi & Connectivity",
                  "Value For Money", "Recommended")
reviews_final_data <- matrix(ncol = length(field_labels), nrow = n_reviews)

ir <- 1  # output row (review index)
ic <- 1  # field counter within the current review
for (i in seq_len(nrow(provisional_data))) {

  # A review spans 13 fields; start a new output row after the 13th.
  if (ic >= 14) {
    ir <- ir + 1
    ic <- 1
  }

  # Dispatch on the field label via match() instead of a 13-way if/else chain.
  # match() returns NA for unknown (or NA) labels, which are simply skipped;
  # the original `==` comparisons would error on an NA label.
  col <- match(provisional_data$X1[i], field_labels)
  if (!is.na(col)) {
    if (ir > n_reviews) {
      break  # output matrix is full
    }
    reviews_final_data[ir, col] <- provisional_data$X2[i]
  }

  # Increment the within-review field counter (once per input row, as before).
  ic <- ic + 1
}



# Keep only the descriptive fields (columns 1-5 and 13); the seven per-amenity
# columns (6-12) are dropped because dedicated star-rating columns are built
# separately below.
reviews_final_data <- data.frame(reviews_final_data[, c(1:5, 13)])
colnames(reviews_final_data) <- c("Aircraft", "Type.Of.Traveller", "Seat.Type",
                                  "Route", "Date Flown", "Recommended")

EDA: Data Wrangling for Star Ratings

# Data Wrangling for Star Ratings.
# final_ratings holds one rating value per row, in a fixed repeating order of
# 7 ratings per review; reshape it into one row of 7 columns per review.
rating_labels <- c('Seat Comfort', 'Cabin Staff Service', 'Food & Beverages',
                   'Inflight Entertainment', 'Ground Service',
                   'Wifi & Connectivity', 'Value For Money')
n_reviews <- 2159
# Take exactly 7 values per review and fill row-wise. This replaces the
# original index-juggling loop, which also wrote a stray row 2160 past the
# end of the data frame and then had to delete it again afterwards.
revised_ratings <- data.frame(matrix(final_ratings[seq_len(7 * n_reviews), 1],
                                     nrow = n_reviews, ncol = 7, byrow = TRUE))
colnames(revised_ratings) <- rating_labels

Data Integration

# Combine the three data frames column-wise: reviews_final_data (descriptive
# attributes other than star ratings), revised_ratings (the seven star-rating
# columns), and text_reviews (the free-text customer reviews).
# NOTE(review): cbind assumes all three have the same row count and row order;
# verify, since each was built independently.
reviews <- cbind(reviews_final_data, revised_ratings, text_reviews)

Data Cleaning

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
# Columns holding the seven individual star ratings.
ratings_cols <- c("Seat Comfort", "Cabin Staff Service", "Food & Beverages",
                  "Inflight Entertainment", "Ground Service", "Wifi & Connectivity", "Value For Money")

# Total Rating = mean of the seven star ratings for each review.
reviews <- reviews %>% mutate(`Total Rating` = rowSums(reviews[, ratings_cols])/7)

# The review text embeds its verification status ("✅ Trip Verified" /
# "Not Verified") before the review body, so lift it into its own column.
reviews <- reviews %>% mutate(Verification = ifelse(grepl("✅ Trip Verified", Reviews), "Trip Verified", ifelse(grepl("Not Verified", Reviews), "Not Verified", NA)))

# Strip the verification prefixes from the review text itself.
# fixed = TRUE treats "|" as a literal character. The original patterns left
# "|" unescaped, so the regex engine read it as alternation-with-empty and the
# literal pipe characters were never actually removed from the text.
reviews$Reviews  <- gsub("✅ Trip Verified | ", "", reviews$Reviews, fixed = TRUE)
reviews$Reviews  <- gsub("✅ Trip Verified |", "", reviews$Reviews, fixed = TRUE)
reviews$Reviews <- gsub("Not Verified |", "", reviews$Reviews, fixed = TRUE)
reviews$Reviews <- gsub("|", "", reviews$Reviews, fixed = TRUE)
# We can break down the "Route" column into two parts: "Origin" and "Destination"
library(tidyr)
Route_updated <- data.frame(reviews$Route)
# Split "Route" into "Origin" and "Destination" on the literal " to ".
# Multi-leg routes with more than one " to " lose the surplus pieces, and
# routes with none get NA destinations (see the warnings below).
Route_updated <- separate(Route_updated, col = reviews.Route, into = c("Origin", "Destination"), sep = c(" to "))
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [301, 1471].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [1337, 1752, 1941,
## 1976].
head(Route_updated)
##          Origin            Destination
## 1      New York                   Rome
## 2   Los Angeles              Melbourne
## 3     Cape Town                Atlanta
## 4      San Juan Paris via New York JFK
## 5 Ft Lauderdale                Atlanta
## 6 Ft Lauderdale                Atlanta
# Drop column 4 ("Route") now that it has been split out.
# NOTE(review): positional index — fragile if the column layout changes.
reviews <- reviews[,-4]

reviews <- cbind(reviews, Route_updated)
# Formatting the Date Flown column in the Data frame.
class(reviews$`Date Flown`)
## [1] "character"
library(lubridate)
# Convert "Month Year" strings to Date by prepending day 01, then reformat as
# MM-YY. Entries dmy() cannot parse become NA (178 per the warning below).
reviews$`Date Flown` <- dmy(paste0("01-", reviews$`Date Flown`))
## Warning: 178 failed to parse.
reviews$`Date Flown` <- format(reviews$`Date Flown`, "%m-%y")

head(reviews$`Date Flown`)
## [1] "03-23" "04-23" "03-23" "04-23" "04-23" "04-23"
# Identifying the missing values: NA counts per column (the ## lines are the
# knitted console output from the run).
cat("There are", sum(is.na(reviews$Aircraft)), "missing data values in 'Aircraft' column.\n")
## There are 1614 missing data values in 'Aircraft' column.
cat("There are", sum(is.na(reviews$Type.Of.Traveller)), "missing data values in 'Type of traveller' column.\n")
## There are 177 missing data values in 'Type of traveller' column.
cat("There are", sum(is.na(reviews$Seat.Type)), "missing data values in 'Seat Type' column.\n")
## There are 0 missing data values in 'Seat Type' column.
cat("There are", sum(is.na(reviews$`Date Flown`)), "missing data values in 'Date Flown' column.\n")
## There are 178 missing data values in 'Date Flown' column.
cat("There are", sum(is.na(reviews$Recommended)), "missing data values in 'Recommended' column.\n")
## There are 0 missing data values in 'Recommended' column.
cat("There are", sum(is.na(reviews$`Seat Comfort`)), "missing data values in 'Seat Comfort' column.\n")
## There are 0 missing data values in 'Seat Comfort' column.
cat("There are", sum(is.na(reviews$`Cabin Staff Service`)), "missing data values in 'Cabin Staff Services' column.\n")
## There are 0 missing data values in 'Cabin Staff Services' column.
cat("There are", sum(is.na(reviews$`Food & Beverages`)), "missing data values in 'Food & Beverages' column.\n")
## There are 0 missing data values in 'Food & Beverages' column.
cat("There are", sum(is.na(reviews$`Inflight Entertainment`)), "missing data values in 'Inflight Entertainment' column.\n")
## There are 0 missing data values in 'Inflight Entertainment' column.
cat("There are", sum(is.na(reviews$`Ground Service`)), "missing data values in 'Ground Service' column.\n")
## There are 0 missing data values in 'Ground Service' column.
cat("There are", sum(is.na(reviews$`Wifi & Connectivity`)), "missing data values in 'Wifi & Connectivity' column.\n")
## There are 0 missing data values in 'Wifi & Connectivity' column.
cat("There are", sum(is.na(reviews$`Value For Money`)), "missing data values in 'Value for Money' column.\n")
## There are 0 missing data values in 'Value for Money' column.
cat("There are", sum(is.na(reviews$Reviews)), "missing data values in 'Reviews' column.\n")
## There are 0 missing data values in 'Reviews' column.
cat("There are", sum(is.na(reviews$Verification)), "missing data values in 'Verification' column.\n")
## There are 5 missing data values in 'Verification' column.
cat("There are", sum(is.na(reviews$Origin)), "missing data values in 'Origin' column.\n")
## There are 187 missing data values in 'Origin' column.
cat("There are", sum(is.na(reviews$Destination)), "missing data values in 'Destination' column.\n")
## There are 191 missing data values in 'Destination' column.
# Removing the 'Aircraft Column' since it doesn't contain any valuable information or there are 1621 missing values.
# NOTE(review): the output above reports 1614 NAs, not 1621 — confirm which run
# this commentary refers to. Positional index -1 drops column 1 (Aircraft).
reviews <- reviews[,-1]

# Replacing the NAs in 'Type of Traveller' with random values from
# "Solo Leisure", "Couple Leisure", "Family Leisure" and "Business".
# Each missing cell gets its own independent draw: the original used
# ifelse(..., sample(levels, 1), ...), where sample(..., 1) is evaluated ONCE
# and recycled, so every NA received the same single value.
traveller_levels <- c("Solo Leisure", "Business", "Couple Leisure", "Family Leisure")
na_idx <- is.na(reviews$Type.Of.Traveller)
reviews$Type.Of.Traveller[na_idx] <- sample(traveller_levels, sum(na_idx), replace = TRUE)

# Same per-cell random imputation for 'Seat Type'.
seat_levels <- c("Economy Class", "First Class", "Premium Economy", "Business Class")
na_idx <- is.na(reviews$Seat.Type)
reviews$Seat.Type[na_idx] <- sample(seat_levels, sum(na_idx), replace = TRUE)

# Replacing the NAs in 'Date Flown' with random months between April 2015 and
# September 2022 (formatted MM-YY to match the cleaned column).
months <- seq(from = ymd("2015-04-01"), to = ymd("2022-09-01"), by = "months")
months_str <- format(months, "%m-%y")

reviews$`Date Flown`[is.na(reviews$`Date Flown`)] <- sample(months_str, sum(is.na(reviews$`Date Flown`)), replace = TRUE)

# Reviews with no verification tag default to "Not Verified".
reviews$Verification[is.na(reviews$Verification)] <- "Not Verified"

# Drop rows where the route could not be split into both Origin and Destination.
reviews <- reviews[complete.cases(reviews[c("Origin", "Destination")]),]

# For multi-leg destinations ("X via Y"), keep the via stop(s) as the final
# destination; "via Y & Z" becomes "Y, Z".
reviews$Final.Destination <- gsub("(.*via\\s)(\\w+)(\\s\\&\\s)(\\w+)(.*)", "\\2, \\4", reviews$Destination)
reviews$Final.Destination <- gsub("(.*via\\s)(\\w+)(.*)", "\\2", reviews$Final.Destination)
# Drop column 16 (the now-redundant raw Destination).
# NOTE(review): positional index — fragile if the column layout changes.
reviews <- reviews[,-16]

# Repair multi-word city names truncated by the single-word \\w+ capture.
reviews$Final.Destination <- ifelse(grepl("New", reviews$Final.Destination), "New York",  ifelse(grepl("Salt", reviews$Final.Destination), "Salt Lake City",reviews$Final.Destination))
# Re-audit NA counts after the cleaning steps above (## lines are knitted output).
cat("There are", sum(is.na(reviews$Type.Of.Traveller)), "missing data values in 'Type of traveller' column.\n")
## There are 0 missing data values in 'Type of traveller' column.
cat("There are", sum(is.na(reviews$Seat.Type)), "missing data values in 'Seat Type' column.\n")
## There are 0 missing data values in 'Seat Type' column.
# NOTE(review): 'Route' was dropped earlier (reviews[,-4]), so reviews$Route is
# NULL here and this check trivially prints 0 — it is not a real audit.
cat("There are", sum(is.na(reviews$Route)), "missing data values in 'Route' column.\n")
## There are 0 missing data values in 'Route' column.
cat("There are", sum(is.na(reviews$`Date Flown`)), "missing data values in 'Date Flown' column.\n")
## There are 0 missing data values in 'Date Flown' column.
cat("There are", sum(is.na(reviews$Recommended)), "missing data values in 'Recommended' column.\n")
## There are 0 missing data values in 'Recommended' column.
cat("There are", sum(is.na(reviews$`Seat Comfort`)), "missing data values in 'Seat Comfort' column.\n")
## There are 0 missing data values in 'Seat Comfort' column.
cat("There are", sum(is.na(reviews$`Cabin Staff Service`)), "missing data values in 'Cabin Staff Services' column.\n")
## There are 0 missing data values in 'Cabin Staff Services' column.
cat("There are", sum(is.na(reviews$`Food & Beverages`)), "missing data values in 'Food & Beverages' column.\n")
## There are 0 missing data values in 'Food & Beverages' column.
cat("There are", sum(is.na(reviews$`Inflight Entertainment`)), "missing data values in 'Inflight Entertainment' column.\n")
## There are 0 missing data values in 'Inflight Entertainment' column.
cat("There are", sum(is.na(reviews$`Ground Service`)), "missing data values in 'Ground Service' column.\n")
## There are 0 missing data values in 'Ground Service' column.
cat("There are", sum(is.na(reviews$`Wifi & Connectivity`)), "missing data values in 'Wifi & Connectivity' column.\n")
## There are 0 missing data values in 'Wifi & Connectivity' column.
cat("There are", sum(is.na(reviews$`Value For Money`)), "missing data values in 'Value for Money' column.\n")
## There are 0 missing data values in 'Value for Money' column.
cat("There are", sum(is.na(reviews$Reviews)), "missing data values in 'Reviews' column.\n")
## There are 0 missing data values in 'Reviews' column.
cat("There are", sum(is.na(reviews$Verification)), "missing data values in 'Verification' column.\n")
## There are 0 missing data values in 'Verification' column.
cat("There are", sum(is.na(reviews$Origin)), "missing data values in 'Origin' column.\n")
## There are 0 missing data values in 'Origin' column.
# NOTE(review): 'Destination' was also dropped (reviews[,-16]), so this check
# likewise inspects NULL and always prints 0.
cat("There are", sum(is.na(reviews$Destination)), "missing data values in 'Destination' column.\n")
## There are 0 missing data values in 'Destination' column.
summary(reviews)
##  Type.Of.Traveller   Seat.Type          Date Flown        Recommended       
##  Length:1968        Length:1968        Length:1968        Length:1968       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   Seat Comfort   Cabin Staff Service Food & Beverages Inflight Entertainment
##  Min.   :1.000   Min.   :1.000       Min.   :1.000    Min.   :1.000         
##  1st Qu.:1.000   1st Qu.:1.000       1st Qu.:1.000    1st Qu.:1.000         
##  Median :2.000   Median :2.000       Median :2.000    Median :2.000         
##  Mean   :2.174   Mean   :2.173       Mean   :2.218    Mean   :2.181         
##  3rd Qu.:3.000   3rd Qu.:3.000       3rd Qu.:3.000    3rd Qu.:3.000         
##  Max.   :5.000   Max.   :5.000       Max.   :5.000    Max.   :5.000         
##  Ground Service  Wifi & Connectivity Value For Money   Reviews         
##  Min.   :1.000   Min.   :1.000       Min.   :1.000   Length:1968       
##  1st Qu.:1.000   1st Qu.:1.000       1st Qu.:1.000   Class :character  
##  Median :2.000   Median :2.000       Median :2.000   Mode  :character  
##  Mean   :2.148   Mean   :2.173       Mean   :2.177                     
##  3rd Qu.:3.000   3rd Qu.:3.000       3rd Qu.:3.000                     
##  Max.   :5.000   Max.   :5.000       Max.   :5.000                     
##   Total Rating   Verification          Origin          Final.Destination 
##  Min.   :1.000   Length:1968        Length:1968        Length:1968       
##  1st Qu.:1.714   Class :character   Class :character   Class :character  
##  Median :2.286   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2.178                                                           
##  3rd Qu.:2.714                                                           
##  Max.   :3.429
dim(reviews)
## [1] 1968   16
# Sum of all seven star ratings across reviews of a given traveller type.
# Vectorised (rowSums over a filtered subset) replacement for four copy-pasted
# row-by-row accumulation loops; the numeric results are identical.
sum_ratings_by_type <- function(type) {
  sum(rowSums(reviews[reviews$Type.Of.Traveller == type, ratings_cols]))
}

Total_ratings_Business <- sum_ratings_by_type("Business")
Total_ratings_Business
## [1] 5915
Total_ratings_Couple <- sum_ratings_by_type("Couple Leisure")
Total_ratings_Couple
## [1] 6856
Total_ratings_Family <- sum_ratings_by_type("Family Leisure")
Total_ratings_Family
## [1] 7297
Total_ratings_Solo <- sum_ratings_by_type("Solo Leisure")
Total_ratings_Solo
## [1] 9933

#Word Cloud

# install.packages("wordcloud")
# library(wordcloud)
# install.packages("RColorBrewer")
# library(RColorBrewer)
# install.packages("tm")
# library(tm)
# install.packages("openNLP")
# install.packages("rJava")
# #library(openNLP)
# install.packages("NLP")
# library(NLP)
# 
# # Define a function to extract only adjectives from the text data
# adj_extractor <- function(text) {
#   word_tokenizer <- Maxent_Word_Token_Annotator()
#   pos_tag_annotator <- Maxent_POS_Tag_Annotator()
#   text <- as.String(text)
#   annotations <- NLP::annotate(text, list(word_tokenizer, pos_tag_annotator))
#   adjectives <- subset(annotations$POS, Type == "JJ")
#   adj_words <- text[adjectives]
#   return(adj_words)
# }
# 
# # Apply the adjective extractor to the text data
# text_data <- reviews$Reviews
# text_data <- gsub("flights", "", text_data)
# text_data <- gsub("plane", "", text_data)
# text_data <- gsub("airport", "", text_data)
# text_data <- gsub("get", "", text_data)
# text_data <- gsub("airline", "", text_data)
# text_data <- gsub("airlines", "", text_data)
# text_data <- gsub("flight", "", text_data)
# 
# adj_text_data <- sapply(text_data, adj_extractor)
# adj_text_data <- unlist(adj_text_data)
# adj_text_data <- gsub("[^[:alnum:][:space:]]*", "", adj_text_data)
# 
# docs <- Corpus(VectorSource(adj_text_data))
# 
# docs <- docs %>%
#   tm_map(removeNumbers) %>%
#   tm_map(removePunctuation) %>%
#   tm_map(stripWhitespace)
# 
# dtm <- TermDocumentMatrix(docs)
# matrix <- as.matrix(dtm)
# words <- sort(rowSums(matrix), decreasing=TRUE)
# df <- data.frame(word = names(words), freq = words)
# 
# set.seed(1234)
# wordcloud(
#   words = df$word,
#   freq = df$freq,
#   min.freq = 1,
#   max.words = 200,
#   random.order = FALSE,
#   rot.per = 0.35,
#   colors = brewer.pal(8, "Dark2")
# )

#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
#install.packages("RColorBrewer")
library(RColorBrewer)
#install.packages("tm")
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Strip airline-domain filler words from the review text before building the
# word cloud, so they do not dominate the frequency counts.
# NOTE(review): plain gsub also hits substrings (e.g. "get" inside "getting",
# "flight" inside "flights"), and the order matters — "flights"/"airlines"
# must be removed before their singular forms. Confirm this is acceptable.
text_data <- reviews$Reviews
text_data <- gsub("flights","",text_data)
text_data <- gsub("plane", "", text_data)
text_data <- gsub("airport", "", text_data)
text_data <- gsub("get","", text_data)
text_data <- gsub("airline", "", text_data)
text_data <- gsub("airlines","", text_data)
text_data <- gsub("flight", "", text_data)
# Build a tm corpus, one document per review.
docs <- Corpus(VectorSource(text_data))


# Standard text normalisation: drop digits and punctuation, collapse whitespace.
docs <- docs %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., stripWhitespace): transformation drops
## documents
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Term-document matrix -> per-term total frequencies, sorted descending.
dtm <- TermDocumentMatrix(docs)
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix),decreasing=TRUE)
df <- data.frame(word = names(words),freq=words)

# Fixed seed so the word-cloud layout is reproducible across knits.
set.seed(1234)
wordcloud(words = df$word, freq = df$freq, min.freq = 1, max.words=200, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Dark2"))

library(ggplot2)
# Column chart: summed Total Rating per traveller type.
ggplot(reviews,
       aes(x = `Type.Of.Traveller`, y = `Total Rating`,
           col = `Type.Of.Traveller`)) +
  geom_col() +
  labs(x = "Type of Traveler", y = "Total Ratings", title = "By travel type")

library(ggplot2)
# Column chart: summed Total Rating per seat type.
ggplot(reviews,
       aes(x = `Seat.Type`, y = `Total Rating`, col = `Seat.Type`)) +
  geom_col() +
  labs(x = "Seat Type", y = "Total Ratings", title = "By Seat type")

library(ggplot2)
# Column chart: summed Total Rating by verification status.
ggplot(reviews, aes(x = `Verification`, y = `Total Rating`, col=`Total Rating`)) +
 geom_col() +
  # Fixed: x-axis label was "Seat Type", a copy-paste error from the previous
  # plot; the x aesthetic here is Verification.
  xlab("Verification") +
  ylab("Total Ratings") +
  ggtitle("By Verification")

# Pie chart of the traveller-type distribution: tabulate counts, then draw a
# single stacked bar and wrap it into polar coordinates.
type_counts <- table(reviews$Type.Of.Traveller)
type_df <- data.frame(Type.Of.Traveller = names(type_counts),
count = as.numeric(type_counts))
ggplot(type_df, aes(x = "", y = count, fill = Type.Of.Traveller)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0) +
scale_fill_discrete(name = "Type of traveller") +
ggtitle("Type of traveler distribution") +
# Hide the y axis (meaningless after the polar transform) and centre the title.
theme(plot.title = element_text(hjust = 0.5),
axis.line.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
# Label each slice with its raw count, centred within the slice.
geom_text(aes(label = count), position = position_stack(vjust = 0.5))

library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Pairwise correlations between six amenity ratings.
# NOTE(review): "Value For Money" is omitted from this matrix although the
# title says "Amenities" — confirm whether that exclusion is intentional.
corr_data <- cor(reviews[, c("Seat Comfort", "Cabin Staff Service", "Food & Beverages",
"Inflight Entertainment", "Ground Service", "Wifi & Connectivity")])

# Melt the correlation matrix to long form and draw stacked columns of the
# correlation values per amenity (includes the trivial self-correlations of 1).
melted_corr <- melt(corr_data)
ggplot(melted_corr, aes(x = Var1, y = value, fill = Var2)) +
geom_col() +
scale_fill_discrete(name = "Amenities") +
theme_minimal() +
ggtitle("Correlation between Amenities") +
xlab("Amenities") +
ylab("Correlation") +
theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
axis.title = element_text(size = 14),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_text(size = 12),
legend.text = element_text(size = 12),
legend.position = "bottom")

library(ggplot2)

# Stacked bar chart: how often verified vs unverified reviewers recommend.
ggplot(reviews, aes(x = Verification, fill = Recommended)) +
  geom_bar() +
  scale_fill_discrete(name = "Recommended") +
  theme_minimal() +
  labs(title = "Verification vs Recommended", x = "Verification", y = "Count") +
  theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
        axis.title = element_text(size = 14),
        axis.text.x = element_text(angle = 45, hjust = 1),
        legend.title = element_text(size = 12),
        legend.text = element_text(size = 12),
        legend.position = "bottom")

# Box plot of Total Rating by verification status, with individual reviews
# overlaid as points sized by rating and coloured by traveller type.
reviews %>%
  ggplot(aes(Verification, `Total Rating`)) +
  geom_boxplot() +
  # Fixed: the original referenced reviews$`Type Of Traveler`, a column that
  # does not exist (it is Type.Of.Traveller), so the colour aesthetic silently
  # mapped to NULL; bare column names inside aes() are also the correct idiom.
  geom_point(alpha = 0.5, aes(size = `Total Rating`, color = Type.Of.Traveller))

# Correlation plot between amenities
library(corrplot)
## corrplot 0.92 loaded
# Columns 5:11 — the six amenity ratings plus one more column (presumably
# Value For Money); NOTE(review): the k-means section subsets 5:10 — confirm
# column 11 is intended here.
amenities_df <- reviews[, c(5:11)]
col <- colorRampPalette(c("Black", "lightblue", "lightgreen"))

# Ellipse-style correlogram with numeric coefficients overlaid in black.
corrplot(
  cor(amenities_df),
  method = "ellipse",
  col = col(500),
  addCoef.col = "black",
  tl.col = "black"
)
## Warning in ind1:ind2: numerical expression has 2 elements: only the first used

# Scatter-plot matrix of the same columns.
# FIX: plot title typo "Ameneties" -> "Amenities".
pairs(amenities_df, main = "Amenities", pch = 21, bg = c("#CFB87C"))

library(ggplot2)
# Side-by-side (dodged) bars of recommended vs not-recommended reviews for
# each Date Flown value.
ggplot(reviews, aes(x = `Date Flown`, fill = Recommended)) +
  geom_bar(position = position_dodge()) +
  scale_fill_manual(values = c("no" = "red", "yes" = "green")) +
  labs(x = "Date Flown", y = "Count", fill = "Recommendation")

# K-Means

# install.packages("factoextra")
# Load the clustering/visualisation helpers (fviz_nbclust, fviz_cluster).
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Keep the traveller-type labels aside for reference; k-means itself is
# unsupervised and never sees them. The table shows the class balance.
Reviews.Labels <- reviews$Type.Of.Traveller
table(Reviews.Labels)
## Reviews.Labels
##       Business Couple Leisure Family Leisure   Solo Leisure 
##            386            460            473            649
# Columns 5:10 are the six numeric amenity ratings (per the head() output
# below): Seat Comfort, Cabin Staff Service, Food & Beverages, Inflight
# Entertainment, Ground Service, Wifi & Connectivity.
reviews_kmeans_clustering <- reviews[5:10]
head(reviews_kmeans_clustering)
##   Seat Comfort Cabin Staff Service Food & Beverages Inflight Entertainment
## 1            1                   2                3                      1
## 2            2                   3                1                      2
## 3            1                   2                3                      1
## 4            1                   2                3                      1
## 5            3                   1                1                      1
## 6            1                   1                1                      2
##   Ground Service Wifi & Connectivity
## 1              2                   3
## 2              3                   1
## 3              2                   1
## 4              1                   1
## 5              1                   1
## 6              3                   4
# Standardise each rating column (mean 0, sd 1) so no single amenity
# dominates the Euclidean distances used by k-means.
reviews_kmeans_clustering_scale <- scale(reviews_kmeans_clustering)
# Calculate K for K-Means using Elbow Method and Silhouette Method
fviz_nbclust(reviews_kmeans_clustering_scale, kmeans, method = "wss") + geom_vline(xintercept = 4, linetype = 2) + labs(subtitle = "Elbow Method")

fviz_nbclust(reviews_kmeans_clustering_scale, kmeans, method = "silhouette") + labs(subtitle = "Silhouette Method")

# Calculate Distance
# NOTE(review): reviews.dist is not consumed by kmeans() below (kmeans works
# on the raw matrix); it appears unused in this section — confirm it is not
# needed later before removing. dist() on ~2000 rows is also memory-heavy.
reviews.dist <- dist(reviews_kmeans_clustering_scale)
# Perform K-Means with k = 4 (chosen from the elbow plot above);
# nstart = 1000 random restarts keeps the best (lowest within-SS) solution.
km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 4, nstart = 1000)
print(km.out)
## K-means clustering with 4 clusters of sizes 897, 231, 434, 406
## 
## Cluster means:
##   Seat Comfort Cabin Staff Service Food & Beverages Inflight Entertainment
## 1   -0.4127647         -0.50050225       -0.5482564             -0.5569192
## 2    0.3345114          1.12412374        1.8558250             -0.9156574
## 3    1.2273702          0.39093024       -0.5959371              0.1491683
## 4   -0.5903961          0.04831086        0.7924312              1.5919565
##   Ground Service Wifi & Connectivity
## 1     -0.5015258          -0.4017139
## 2     -0.2820461           0.2425646
## 3      0.8787580           1.2184140
## 4      0.3291632          -0.5529230
## 
## Clustering vector:
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
##    1    1    1    1    1    1    1    1    1    2    1    3    1    1    4    1 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
##    2    1    1    1    1    1    3    2    4    4    3    1    3    2    3    4 
##   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48 
##    1    1    1    1    1    1    3    1    3    4    1    1    1    1    1    1 
##   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64 
##    1    1    1    4    2    4    2    3    1    1    3    4    3    3    1    1 
##   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80 
##    1    2    3    4    3    4    1    2    1    1    3    4    3    4    2    4 
##   81   82   83   84   85   86   87   88   89   90   91   92   93   94   95   96 
##    2    3    4    1    1    1    1    1    3    4    3    1    2    1    2    1 
##   97   98   99  100  101  102  103  104  105  106  107  108  109  110  111  112 
##    1    3    1    1    1    1    1    1    1    1    3    4    3    4    3    2 
##  113  114  115  116  117  118  119  120  121  122  123  124  125  126  127  128 
##    4    3    4    3    1    1    1    1    1    2    3    4    2    2    3    2 
##  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144 
##    3    1    3    3    4    3    4    3    3    4    2    1    4    2    3    1 
##  145  146  147  148  149  150  151  152  153  154  155  156  157  158  159  160 
##    3    1    1    1    1    1    2    3    4    3    4    4    3    1    3    1 
##  161  162  163  164  165  166  167  168  169  170  171  172  173  174  175  176 
##    1    1    3    4    3    2    1    1    3    4    3    4    2    1    1    1 
##  177  178  179  180  181  182  183  184  185  186  187  188  189  191  192  193 
##    1    1    1    1    1    4    1    1    1    1    4    3    1    1    1    1 
##  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208  209 
##    1    1    1    1    3    4    2    3    2    1    1    1    1    4    1    3 
##  210  211  212  213  214  215  216  217  218  219  220  221  222  223  224  225 
##    1    4    1    1    3    1    3    4    3    1    1    4    3    4    3    4 
##  226  227  228  229  230  231  232  233  234  235  236  237  238  240  241  242 
##    1    4    4    3    2    3    4    3    4    2    4    2    3    1    3    1 
##  243  244  245  246  247  248  249  250  251  252  253  254  255  256  257  258 
##    3    1    4    3    4    3    4    3    3    2    1    1    1    1    4    3 
##  259  260  261  262  263  264  265  266  267  268  269  270  271  272  273  274 
##    3    4    3    2    3    4    1    1    3    4    2    3    4    1    1    1 
##  275  276  277  278  279  280  281  282  283  284  285  286  287  288  289  290 
##    1    3    4    3    4    1    1    1    3    2    2    1    1    4    4    3 
##  291  292  293  294  295  296  297  298  299  300  301  302  303  304  305  306 
##    4    3    1    4    2    3    4    3    3    3    1    1    4    4    1    1 
##  307  308  309  310  311  312  313  314  315  316  317  318  319  320  321  322 
##    1    1    1    4    3    4    2    3    4    2    1    3    4    3    3    1 
##  323  324  325  326  327  328  329  330  331  332  333  334  335  336  337  338 
##    3    1    1    1    1    1    3    3    3    4    2    3    4    2    4    1 
##  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354 
##    1    1    4    4    1    3    4    2    3    2    1    1    1    4    1    1 
##  355  356  357  358  359  360  361  362  363  364  365  366  367  368  369  370 
##    1    4    1    1    3    3    4    1    1    2    1    1    1    1    1    3 
##  371  372  373  374  375  376  377  378  379  380  382  383  384  385  386  387 
##    1    1    1    1    4    3    3    4    1    1    1    1    4    3    4    2 
##  388  389  390  391  392  393  394  395  396  397  398  399  400  401  402  403 
##    1    3    1    4    3    4    2    3    4    4    2    3    2    4    4    1 
##  404  405  406  407  408  409  410  411  412  413  414  415  416  417  418  419 
##    3    1    1    1    1    4    1    1    3    2    4    3    4    2    3    4 
##  420  421  422  423  424  425  426  427  428  429  430  431  432  433  434  435 
##    4    4    1    1    1    2    1    1    1    1    2    3    4    3    4    3 
##  436  437  439  440  441  442  443  444  445  446  447  448  449  450  451  452 
##    4    1    1    3    1    1    3    2    3    3    1    4    2    1    4    3 
##  453  454  455  456  457  458  459  460  461  462  463  464  465  466  467  468 
##    4    2    3    1    1    1    1    1    1    1    1    1    1    3    2    1 
##  469  470  471  472  473  474  475  477  478  479  480  481  482  483  484  485 
##    3    4    3    3    1    1    3    2    1    4    3    4    2    3    1    1 
##  486  487  488  489  490  491  492  493  494  495  496  497  498  499  500  501 
##    1    1    2    3    1    3    2    3    4    1    1    3    1    1    1    1 
##  502  503  504  505  506  507  508  509  510  511  512  513  514  515  516  517 
##    3    1    1    1    4    1    1    2    4    3    2    3    1    3    4    2 
##  518  519  520  521  522  523  524  525  527  528  529  530  531  532  533  534 
##    4    3    1    1    1    1    3    4    3    2    4    1    4    3    4    3 
##  535  536  537  538  539  540  541  542  543  544  545  546  547  548  549  550 
##    3    4    3    4    1    1    1    1    1    4    3    4    2    3    1    4 
##  551  552  553  554  555  556  557  558  559  560  561  562  563  564  565  567 
##    2    4    1    4    2    2    2    4    2    3    1    1    3    4    2    1 
##  568  569  570  571  572  573  574  575  576  577  578  579  580  581  582  583 
##    4    3    1    3    1    1    3    4    2    3    4    3    1    1    1    1 
##  584  585  586  587  588  589  590  591  592  593  594  595  596  597  598  599 
##    2    1    2    3    4    3    4    2    3    4    3    1    1    1    3    1 
##  600  601  602  603  604  605  606  607  608  609  610  611  612  613  614  615 
##    1    1    4    4    3    4    1    1    1    1    1    1    1    1    4    1 
##  616  617  618  619  620  621  622  623  624  625  626  627  628  629  630  631 
##    1    1    3    4    2    3    4    4    1    1    1    1    1    1    3    3 
##  632  633  634  635  636  637  638  639  640  641  642  643  644  645  646  647 
##    4    4    2    4    4    1    1    4    3    4    2    1    1    1    2    3 
##  648  649  650  651  652  653  655  656  657  658  659  660  661  662  663  664 
##    4    3    1    1    1    1    2    1    2    3    3    4    1    1    1    4 
##  665  666  667  668  669  670  671  672  673  674  675  677  678  679  680  681 
##    3    4    2    1    1    1    1    3    2    1    2    4    3    4    1    1 
##  682  683  684  685  686  688  689  690  691  692  693  694  695  696  697  698 
##    1    1    3    4    3    1    3    1    1    1    1    1    1    1    3    2 
##  699  700  701  702  703  704  705  706  707  708  709  710  711  712  713  714 
##    1    1    1    1    1    1    1    3    4    4    1    3    1    1    1    1 
##  715  716  717  718  719  720  721  722  723  724  725  726  727  728  729  730 
##    1    3    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  731  732  733  734  735  736  737  738  739  740  741  742  743  744  745  746 
##    1    1    1    1    1    1    1    1    3    1    2    3    1    1    1    4 
##  747  748  749  750  751  752  753  754  755  756  757  758  759  760  761  762 
##    2    1    1    4    3    1    1    1    1    1    4    3    4    2    3    1 
##  763  764  765  766  767  768  769  770  771  772  773  774  775  776  777  778 
##    1    1    1    1    1    1    1    1    4    1    1    2    3    4    3    4 
##  779  780  781  782  783  784  785  786  787  788  789  790  791  792  793  794 
##    2    1    1    1    1    2    1    1    1    1    2    4    1    1    1    2 
##  795  796  797  798  799  800  801  802  803  804  805  806  807  808  809  810 
##    4    3    3    1    1    1    4    3    3    4    1    1    1    1    2    3 
##  811  812  813  814  815  816  817  818  819  820  821  822  823  824  825  826 
##    3    2    2    3    4    3    4    1    1    1    4    1    4    1    3    2 
##  827  828  829  830  831  832  833  834  835  836  837  838  839  840  841  842 
##    4    1    1    1    3    3    1    1    4    3    1    2    1    4    2    1 
##  843  844  845  846  847  848  849  850  851  852  853  854  855  856  857  858 
##    3    4    3    4    2    1    1    1    4    2    1    3    2    3    1    4 
##  859  860  861  862  863  864  865  866  867  868  869  870  871  872  873  874 
##    1    1    1    1    1    3    1    1    1    2    4    3    4    1    2    4 
##  875  876  877  878  879  880  881  882  883  884  885  886  887  888  889  890 
##    1    1    2    4    1    1    2    3    2    3    4    1    1    1    1    2 
##  891  892  893  894  895  896  897  898  899  900  901  902  903  904  905  906 
##    3    3    2    1    3    4    3    4    3    4    2    3    4    3    4    1 
##  907  908  909  910  911  912  913  914  915  916  917  918  919  920  921  922 
##    1    2    3    1    1    1    1    4    2    3    3    1    1    1    3    4 
##  923  924  925  926  927  928  929  930  931  932  933  934  935  936  937  938 
##    2    1    1    2    1    1    1    1    1    1    1    1    1    1    1    1 
##  939  940  941  942  943  944  945  946  947  948  949  950  951  952  953  954 
##    1    3    4    3    4    3    1    1    1    1    1    1    3    2    1    1 
##  955  956  957  958  959  960  961  962  963  964  965  966  967  968  969  970 
##    3    1    1    1    1    1    1    1    1    1    1    1    1    4    4    2 
##  971  972  973  974  975  976  977  978  979  980  981  982  983  984  985  986 
##    1    1    1    1    1    1    3    2    3    2    1    2    4    2    4    3 
##  987  988  989  990  991  992  993  994  995  996  997  998  999 1000 1001 1002 
##    1    1    4    3    1    1    1    3    4    4    1    1    4    3    4    1 
## 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 
##    1    1    1    1    1    2    3    3    3    4    3    4    3    1    1    1 
## 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 
##    1    1    1    1    1    1    3    2    4    3    4    2    3    4    1    3 
## 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 
##    3    4    3    1    1    1    1    1    1    3    1    1    1    2    2    1 
## 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 
##    1    1    1    1    1    1    1    2    1    3    4    2    3    1    3    4 
## 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 
##    3    1    1    1    2    1    1    1    1    1    3    1    1    1    1    4 
## 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 
##    1    1    1    1    1    1    1    4    1    3    4    3    2    3    2    3 
## 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 
##    4    3    2    4    1    4    1    1    1    1    1    1    1    3    4    2 
## 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 
##    4    4    2    2    1    1    1    1    2    1    1    3    2    1    1    2 
## 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 
##    1    2    4    4    4    4    1    3    4    3    4    3    1    1    1    1 
## 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 
##    3    4    3    2    3    1    2    2    1    1    4    3    4    2    1    3 
## 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 
##    3    4    4    4    1    1    1    4    2    3    4    1    4    1    1    4 
## 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 
##    1    1    2    3    3    1    1    3    3    4    3    2    3    4    3    4 
## 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 
##    2    3    4    3    1    1    1    1    3    1    1    3    1    2    4    4 
## 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 
##    1    1    1    4    2    3    4    3    1    3    2    1    1    1    2    3 
## 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 
##    4    3    4    1    1    2    3    3    4    3    2    3    4    1    3    1 
## 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 
##    3    3    1    1    1    1    4    1    2    1    1    1    4    3    4    2 
## 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 
##    3    1    1    1    4    4    3    4    2    2    4    1    4    1    4    3 
## 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 
##    1    2    4    1    1    1    3    1    3    2    1    1    1    1    1    1 
## 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 
##    1    1    1    1    1    1    1    3    1    3    1    3    2    4    1    1 
## 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 
##    1    1    4    1    3    3    4    1    1    1    3    1    1    1    1    4 
## 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1338 1339 
##    1    1    1    1    4    1    4    1    1    1    1    1    1    1    3    4 
## 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 
##    3    4    4    4    4    1    1    1    1    1    1    1    1    3    1    4 
## 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 
##    4    1    1    4    3    1    1    1    1    1    1    1    1    1    1    1 
## 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 
##    4    1    1    3    1    3    1    1    1    2    1    1    1    1    4    2 
## 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 
##    4    1    3    1    1    1    1    2    4    3    1    1    1    1    2    4 
## 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
##    3    4    2    4    1    1    1    1    4    1    1    3    1    2    1    3 
## 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 
##    4    2    1    1    1    1    4    4    1    1    1    2    1    1    4    1 
## 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 
##    2    3    4    1    1    4    3    4    3    1    2    4    1    2    3    4 
## 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 
##    1    1    1    1    1    4    1    1    1    1    1    3    1    1    1    1 
## 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 
##    2    1    1    1    2    3    4    3    1    1    4    1    1    1    1    4 
## 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 
##    3    4    2    1    1    1    1    1    2    3    4    3    1    1    4    3 
## 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 
##    4    3    4    3    4    3    4    3    3    3    2    3    3    4    1    1 
## 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 
##    3    2    3    4    3    3    4    1    1    1    1    1    1    1    3    1 
## 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 
##    1    1    1    4    1    1    1    2    3    1    1    1    1    3    3    4 
## 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 
##    1    3    4    3    4    3    4    1    1    3    4    3    2    3    4    3 
## 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 
##    4    2    1    1    1    1    4    1    3    1    2    1    3    4    3    4 
## 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 
##    2    1    1    1    4    2    2    2    3    4    3    4    3    1    2    3 
## 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 
##    1    4    3    1    1    1    1    1    4    1    3    2    3    4    4    2 
## 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 
##    4    3    3    1    3    4    2    3    4    3    4    2    3    4    3    1 
## 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 
##    1    1    4    2    1    3    3    4    1    4    2    4    3    1    1    3 
## 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 
##    3    4    1    2    3    4    3    2    1    4    1    3    4    4    1    1 
## 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 
##    2    3    2    1    1    4    4    1    1    1    1    1    1    1    4    2 
## 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 
##    4    4    4    3    1    4    2    1    3    4    3    1    1    1    4    2 
## 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 
##    3    4    4    2    1    3    1    3    3    1    3    4    3    1    1    4 
## 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 
##    3    2    1    2    1    1    3    1    4    3    4    1    3    2    4    4 
## 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 
##    1    3    3    4    1    4    3    3    4    1    1    1    1    1    1    3 
## 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1753 1754 1755 1756 
##    4    4    2    3    2    3    4    1    1    1    1    1    4    3    4    3 
## 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 
##    4    3    1    3    1    1    1    1    4    3    2    4    1    1    2    3 
## 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 
##    4    4    3    4    2    1    1    4    4    1    3    4    2    3    4    1 
## 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 
##    3    4    1    4    1    1    1    1    1    1    2    1    1    1    1    1 
## 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 
##    1    1    1    2    4    3    2    2    1    1    1    4    2    3    4    2 
## 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 
##    3    1    4    3    2    3    1    4    1    1    1    1    1    1    1    3 
## 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 
##    4    3    3    4    2    3    4    3    4    2    1    1    1    1    4    3 
## 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 
##    4    2    2    3    4    4    1    1    1    1    3    3    3    4    3    4 
## 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 
##    2    3    4    3    4    2    3    4    3    4    1    2    1    3    4    2 
## 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 
##    4    4    4    3    3    4    2    3    4    3    1    3    4    1    1    4 
## 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 
##    3    4    3    4    2    4    3    1    3    3    2    3    4    3    4    2 
## 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 
##    4    1    1    3    1    4    1    3    1    1    2    3    4    2    3    3 
## 1933 1934 1935 1936 1937 1938 1939 1940 1942 1943 1944 1945 1946 1947 1948 1949 
##    4    3    4    2    1    3    4    3    1    3    4    2    1    1    1    4 
## 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 
##    1    1    1    1    4    2    3    4    1    1    1    1    1    1    1    1 
## 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1977 1978 1979 1980 1981 1982 
##    1    1    1    4    3    4    2    3    3    4    3    4    2    3    4    2 
## 
## Within cluster sum of squares by cluster:
## [1] 2148.4354  225.1447 2014.8653 1321.0365
##  (between_SS / total_SS =  51.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# 51.6 % classification or identification of an observation with a group - could be better.
# Build a one-column data frame of traveller type, reduced to its first
# letter (B/C/F/S), to use as compact row labels on the scaled matrix.
reviews_tot <- reviews[1]
names(reviews_tot) <- c("Type.Of.Traveler")
reviews_tot$Type.Of.Traveler <- substr(reviews_tot$Type.Of.Traveler, 1, 1)

# Label each row as "<initial>_<row number>", e.g. "B_1".
# FIX: seq_len(nrow(x)) instead of 1:dim(x)[1] — idiomatic, and safe if the
# data frame ever has zero rows (1:0 would yield c(1, 0)).
rownames(reviews_kmeans_clustering_scale) <- paste(reviews_tot$Type.Of.Traveler,
                                                   seq_len(nrow(reviews_tot)),
                                                   sep = "_")

head(reviews_kmeans_clustering_scale)
##     Seat Comfort Cabin Staff Service Food & Beverages Inflight Entertainment
## B_1   -0.9268708          -0.1363212        0.6099306             -0.9156574
## S_2   -0.1372250           0.6527378       -0.9499698             -0.1402642
## C_3   -0.9268708          -0.1363212        0.6099306             -0.9156574
## C_4   -0.9268708          -0.1363212        0.6099306             -0.9156574
## S_5    0.6524207          -0.9253802       -0.9499698             -0.9156574
## F_6   -0.9268708          -0.9253802       -0.9499698             -0.1402642
##     Ground Service Wifi & Connectivity
## B_1     -0.1174966           0.6526826
## S_2      0.6743981          -0.9262718
## C_3     -0.1174966          -0.9262718
## C_4     -0.9093913          -0.9262718
## S_5     -0.9093913          -0.9262718
## F_6      0.6743981           1.4421598
# Fit k-means for several choices of k and compare the cluster plots.
# FIX: a kmeans() fit stores assignments in the `cluster` component
# (singular); the original read `km.out$clusters`, which is not a component
# ($ partial matching does not apply, since "clusters" is not a prefix of
# "cluster") and silently evaluated to NULL.
km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 3, nstart = 100)
km.clusters <- km.out$cluster
p1 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale, cluster = km.out$cluster)) + ggtitle("k=3")

km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 4, nstart = 100)
km.clusters <- km.out$cluster
p2 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale, cluster = km.out$cluster)) + ggtitle("k=4")

km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 7, nstart = 100)
km.clusters <- km.out$cluster
p3 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale, cluster = km.out$cluster)) + ggtitle("k=7")

km.out <- kmeans(reviews_kmeans_clustering_scale, centers = 8, nstart = 100)
km.clusters <- km.out$cluster
p4 <- fviz_cluster(list(data = reviews_kmeans_clustering_scale, cluster = km.out$cluster)) + ggtitle("k=8")

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Arrange the four cluster plots in a 2x2 grid for comparison.
grid.arrange(p1, p2, p3, p4, nrow = 2)

Hierarchical Clustering

# Prerequisites: keep traveller-type labels aside for reference (unused by
# the clustering itself).
Reviews.Labels <- reviews$Type.Of.Traveller
table(Reviews.Labels)
## Reviews.Labels
##       Business Couple Leisure Family Leisure   Solo Leisure 
##            386            460            473            649
# Columns 5:11 — the amenity/rating columns used for clustering.
reviews_hierarchical_clustering <- reviews[5:11]
# Scale the data (mean 0, sd 1 per column).
reviews_hierarchical_clustering_scale <- scale(reviews_hierarchical_clustering)
# Distance derived from cosine similarity: 1 - (X'X) / (||col_i|| * ||col_j||).
# NOTE(review): crossprod(X) = t(X) %*% X, so this similarity is between the
# rating COLUMNS, not between individual reviews — the dendrogram therefore
# groups the 7 rating variables. Confirm this is intended (Reviews.Labels
# above suggests clustering the reviews themselves may have been the goal).
cosine_matrix <- as.matrix(reviews_hierarchical_clustering_scale)
reviews_cosine_distance <- 1 - crossprod(cosine_matrix) / sqrt(colSums(cosine_matrix^2) %*% t(colSums(cosine_matrix^2)))
hierarchical.dist <- as.dist(reviews_cosine_distance)
# Dendrogram (FIX: the original fitted the identical hclust() model twice;
# hclust is deterministic, so a single fit is equivalent).
hc.out_reviews <- hclust(hierarchical.dist, method = "ward.D")
plot(hc.out_reviews)
rect.hclust(hc.out_reviews, k = 3, border = 2:5)

Association Rule Mining

# Association rule mining setup: load arules (mining) and arulesViz (plots)
#install.packages("arules")
#install.packages("arulesViz")
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:tm':
## 
##     inspect
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
# Keep only the attributes suitable for rule mining: traveller type, seat
# type, recommended flag, and total rating (columns 1, 2, 4, 13). The numeric
# rating is discretized automatically by apriori() below.
reviews_apriori <- reviews[, c(1,2,4,13)]
summary(reviews_apriori)
##  Type.Of.Traveller   Seat.Type         Recommended         Total Rating  
##  Length:1968        Length:1968        Length:1968        Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:1.714  
##  Mode  :character   Mode  :character   Mode  :character   Median :2.286  
##                                                           Mean   :2.178  
##                                                           3rd Qu.:2.714  
##                                                           Max.   :3.429
# Mine association rules with a low support floor (0.2%) and moderate
# confidence (50%) to surface rare but strong patterns.
# Fix: spell out `support` in full — the original `suppor` relied on partial
# argument matching, which is fragile and flagged by lintr.
rule1 <- apriori(reviews_apriori, parameter=list(support = 0.002, confidence = 0.5)) 
## Warning: Column(s) 1, 2, 3, 4 not logical or factor. Applying default
## discretization (see '? discretizeDF').
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5   0.002      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 3 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 1968 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [167 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Top 20 rules ranked by lift (strength of association beyond chance)
rule1 %>%
  sort(by = "lift") %>%
  head(20) %>%
  inspect()
##      lhs                                    rhs                                  support confidence    coverage     lift count
## [1]  {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=Business Class,                                                                                               
##       Total Rating=[2.57,3.43]}          => {Recommended=yes}                0.003048780  1.0000000 0.003048780 3.352641     6
## [2]  {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=First Class,                                                                                                  
##       Total Rating=[2.57,3.43]}          => {Recommended=yes}                0.006097561  0.6315789 0.009654472 2.117457    12
## [3]  {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=Business Class,                                                                                               
##       Recommended=yes}                   => {Total Rating=[2.57,3.43]}       0.003048780  0.7500000 0.004065041 2.058577     6
## [4]  {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=Premium Economy,                                                                                              
##       Total Rating=[1,1.86)}             => {Recommended=yes}                0.003048780  0.6000000 0.005081301 2.011584     6
## [5]  {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=Business Class,                                                                                               
##       Recommended=no}                    => {Total Rating=[1,1.86)}          0.002032520  0.5714286 0.003556911 1.955776     4
## [6]  {Type.Of.Traveller=Business,                                                                                             
##       Seat.Type=Business Class,                                                                                               
##       Total Rating=[1,1.86)}             => {Recommended=yes}                0.002032520  0.5714286 0.003556911 1.915795     4
## [7]  {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=First Class,                                                                                                  
##       Recommended=yes}                   => {Total Rating=[2.57,3.43]}       0.002032520  0.6666667 0.003048780 1.829847     4
## [8]  {Type.Of.Traveller=Couple Leisure,                                                                                       
##       Seat.Type=Premium Economy,                                                                                              
##       Total Rating=[1,1.86)}             => {Recommended=yes}                0.003556911  0.5384615 0.006605691 1.805268     7
## [9]  {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=Business Class}          => {Recommended=yes}                0.004065041  0.5333333 0.007621951 1.788075     8
## [10] {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=First Class}             => {Recommended=yes}                0.010670732  0.5121951 0.020833333 1.717206    21
## [11] {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=First Class,                                                                                                  
##       Total Rating=[2.57,3.43]}          => {Recommended=yes}                0.002032520  0.5000000 0.004065041 1.676320     4
## [12] {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=First Class,                                                                                                  
##       Total Rating=[1,1.86)}             => {Recommended=yes}                0.002540650  0.5000000 0.005081301 1.676320     5
## [13] {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=Premium Economy,                                                                                              
##       Recommended=yes}                   => {Total Rating=[1.86,2.57)}       0.004573171  0.5625000 0.008130081 1.637574     9
## [14] {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=First Class,                                                                                                  
##       Recommended=yes}                   => {Total Rating=[2.57,3.43]}       0.006097561  0.5714286 0.010670732 1.568440    12
## [15] {Seat.Type=First Class,                                                                                                  
##       Recommended=yes,                                                                                                        
##       Total Rating=[2.57,3.43]}          => {Type.Of.Traveller=Solo Leisure} 0.006097561  0.5000000 0.012195122 1.516179    12
## [16] {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=Business Class,                                                                                               
##       Total Rating=[1,1.86)}             => {Recommended=no}                 0.002032520  1.0000000 0.002032520 1.425054     4
## [17] {Type.Of.Traveller=Family Leisure,                                                                                       
##       Seat.Type=First Class,                                                                                                  
##       Total Rating=[1,1.86)}             => {Recommended=no}                 0.004065041  1.0000000 0.004065041 1.425054     8
## [18] {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=Premium Economy,                                                                                              
##       Recommended=no}                    => {Total Rating=[2.57,3.43]}       0.007621951  0.5172414 0.014735772 1.419709    15
## [19] {Type.Of.Traveller=Solo Leisure,                                                                                         
##       Seat.Type=Premium Economy,                                                                                              
##       Total Rating=[2.57,3.43]}          => {Recommended=no}                 0.007621951  0.9375000 0.008130081 1.335988    15
## [20] {Type.Of.Traveller=Family Leisure,                                                                                       
##       Recommended=yes,                                                                                                        
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class}        0.014227642  0.9655172 0.014735772 1.241109    28
# Scatter plot of all rule1 rules (support vs confidence, shaded by lift)
plot(rule1)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Grouped matrix view: antecedent groups vs consequents
plot(rule1, method = "grouped")

# Top 20 rules ranked by confidence (conditional probability of the RHS)
rule1 %>%
  sort(by = "confidence") %>%
  head(20) %>%
  inspect()
##      lhs                                    rhs                           support confidence    coverage     lift count
## [1]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Seat.Type=Business Class,                                                                                        
##       Total Rating=[1,1.86)}             => {Recommended=no}          0.002032520  1.0000000 0.002032520 1.425054     4
## [2]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Seat.Type=Business Class,                                                                                        
##       Total Rating=[2.57,3.43]}          => {Recommended=yes}         0.003048780  1.0000000 0.003048780 3.352641     6
## [3]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Seat.Type=First Class,                                                                                           
##       Total Rating=[1,1.86)}             => {Recommended=no}          0.004065041  1.0000000 0.004065041 1.425054     8
## [4]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=yes,                                                                                                 
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.014227642  0.9655172 0.014735772 1.241109    28
## [5]  {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Seat.Type=Premium Economy,                                                                                       
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.007621951  0.9375000 0.008130081 1.335988    15
## [6]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[2.57,3.43]}          => {Seat.Type=Economy Class} 0.057926829  0.8837209 0.065548780 1.135965   114
## [7]  {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.060467480  0.8686131 0.069613821 1.116545   119
## [8]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.056910569  0.8682171 0.065548780 1.116036   112
## [9]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.055386179  0.8650794 0.064024390 1.112003   109
## [10] {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no}                    => {Seat.Type=Economy Class} 0.155995935  0.8647887 0.180386179 1.111629   307
## [11] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.071646341  0.8545455 0.083841463 1.098462   141
## [12] {Type.Of.Traveller=Family Leisure}  => {Seat.Type=Economy Class} 0.205284553  0.8541226 0.240345528 1.097919   404
## [13] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[2.57,3.43]}          => {Seat.Type=Economy Class} 0.078252033  0.8461538 0.092479675 1.087675   154
## [14] {Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.170731707  0.8358209 0.204268293 1.074393   336
## [15] {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.041158537  0.8350515 0.049288618 1.073404    81
## [16] {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.085365854  0.8316832 0.102642276 1.069074   168
## [17] {Type.Of.Traveller=Couple Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.042174797  0.8300000 0.050813008 1.066911    83
## [18] {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Recommended=no}                    => {Seat.Type=Economy Class} 0.182926829  0.8275862 0.221036585 1.063808   360
## [19] {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Recommended=no,                                                                                                  
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.057926829  0.8260870 0.070121951 1.061881   114
## [20] {Type.Of.Traveller=Couple Leisure,                                                                                
##       Seat.Type=First Class,                                                                                           
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.007113821  0.8235294 0.008638211 1.173574    14
# Top 20 rules ranked by support (most frequent itemset combinations)
rule1 %>%
  sort(by = "support") %>%
  head(20) %>%
  inspect()
##      lhs                                   rhs                         support confidence  coverage      lift count
## [1]  {}                                 => {Seat.Type=Economy Class} 0.7779472  0.7779472 1.0000000 1.0000000  1531
## [2]  {}                                 => {Recommended=no}          0.7017276  0.7017276 1.0000000 1.0000000  1381
## [3]  {Recommended=no}                   => {Seat.Type=Economy Class} 0.5579268  0.7950760 0.7017276 1.0220180  1098
## [4]  {Seat.Type=Economy Class}          => {Recommended=no}          0.5579268  0.7171783 0.7779472 1.0220180  1098
## [5]  {Total Rating=[2.57,3.43]}         => {Seat.Type=Economy Class} 0.2799797  0.7684798 0.3643293 0.9878303   551
## [6]  {Type.Of.Traveller=Solo Leisure}   => {Seat.Type=Economy Class} 0.2652439  0.8043143 0.3297764 1.0338933   522
## [7]  {Total Rating=[1.86,2.57)}         => {Seat.Type=Economy Class} 0.2616870  0.7618343 0.3434959 0.9792880   515
## [8]  {Total Rating=[2.57,3.43]}         => {Recommended=no}          0.2535569  0.6959554 0.3643293 0.9917742   499
## [9]  {Total Rating=[1.86,2.57)}         => {Recommended=no}          0.2439024  0.7100592 0.3434959 1.0118729   480
## [10] {Total Rating=[1,1.86)}            => {Seat.Type=Economy Class} 0.2362805  0.8086957 0.2921748 1.0395252   465
## [11] {Type.Of.Traveller=Solo Leisure}   => {Recommended=no}          0.2210366  0.6702619 0.3297764 0.9551597   435
## [12] {Recommended=yes}                  => {Seat.Type=Economy Class} 0.2200203  0.7376491 0.2982724 0.9481994   433
## [13] {Type.Of.Traveller=Family Leisure} => {Seat.Type=Economy Class} 0.2052846  0.8541226 0.2403455 1.0979186   404
## [14] {Total Rating=[1,1.86)}            => {Recommended=no}          0.2042683  0.6991304 0.2921748 0.9962988   402
## [15] {Recommended=no,                                                                                              
##       Total Rating=[2.57,3.43]}         => {Seat.Type=Economy Class} 0.1986789  0.7835671 0.2535569 1.0072241   391
## [16] {Seat.Type=Economy Class,                                                                                     
##       Total Rating=[2.57,3.43]}         => {Recommended=no}          0.1986789  0.7096189 0.2799797 1.0112454   391
## [17] {Recommended=no,                                                                                              
##       Total Rating=[1.86,2.57)}         => {Seat.Type=Economy Class} 0.1885163  0.7729167 0.2439024 0.9935336   371
## [18] {Seat.Type=Economy Class,                                                                                     
##       Total Rating=[1.86,2.57)}         => {Recommended=no}          0.1885163  0.7203883 0.2616870 1.0265925   371
## [19] {Type.Of.Traveller=Solo Leisure,                                                                              
##       Recommended=no}                   => {Seat.Type=Economy Class} 0.1829268  0.8275862 0.2210366 1.0638077   360
## [20] {Type.Of.Traveller=Solo Leisure,                                                                              
##       Seat.Type=Economy Class}          => {Recommended=no}          0.1829268  0.6896552 0.2652439 0.9827961   360
# Second pass with stricter thresholds (support 0.5%, confidence 75%) to keep
# only the more reliable rules.
# Fix: spell out `support` in full — the original `suppor` relied on partial
# argument matching, which is fragile and flagged by lintr.
rule2 <- apriori(reviews_apriori, parameter=list(support = 0.005, confidence = 0.75)) 
## Warning: Column(s) 1, 2, 3, 4 not logical or factor. Applying default
## discretization (see '? discretizeDF').
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.75    0.1    1 none FALSE            TRUE       5   0.005      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[13 item(s), 1968 transaction(s)] done [0.00s].
## sorting and recoding items ... [13 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [49 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Top 20 high-confidence rules ranked by lift
rule2 %>%
  sort(by = "lift") %>%
  head(20) %>%
  inspect()
##      lhs                                    rhs                           support confidence    coverage     lift count
## [1]  {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Seat.Type=Premium Economy,                                                                                       
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.007621951  0.9375000 0.008130081 1.335988    15
## [2]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=yes,                                                                                                 
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.014227642  0.9655172 0.014735772 1.241109    28
## [3]  {Type.Of.Traveller=Couple Leisure,                                                                                
##       Seat.Type=First Class,                                                                                           
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.007113821  0.8235294 0.008638211 1.173574    14
## [4]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[2.57,3.43]}          => {Seat.Type=Economy Class} 0.057926829  0.8837209 0.065548780 1.135965   114
## [5]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Seat.Type=Economy Class,                                                                                         
##       Total Rating=[1.86,2.57)}          => {Recommended=no}          0.056910569  0.7943262 0.071646341 1.131958   112
## [6]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Seat.Type=Premium Economy,                                                                                       
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.005589431  0.7857143 0.007113821 1.119686    11
## [7]  {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.060467480  0.8686131 0.069613821 1.116545   119
## [8]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.056910569  0.8682171 0.065548780 1.116036   112
## [9]  {Type.Of.Traveller=Business,                                                                                      
##       Seat.Type=First Class,                                                                                           
##       Total Rating=[1.86,2.57)}          => {Recommended=no}          0.009146341  0.7826087 0.011686992 1.115260    18
## [10] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1.86,2.57)}          => {Recommended=no}          0.065548780  0.7818182 0.083841463 1.114133   129
## [11] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.055386179  0.8650794 0.064024390 1.112003   109
## [12] {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no}                    => {Seat.Type=Economy Class} 0.155995935  0.8647887 0.180386179 1.111629   307
## [13] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.071646341  0.8545455 0.083841463 1.098462   141
## [14] {Type.Of.Traveller=Family Leisure}  => {Seat.Type=Economy Class} 0.205284553  0.8541226 0.240345528 1.097919   404
## [15] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1,1.86)}             => {Recommended=no}          0.049288618  0.7698413 0.064024390 1.097066    97
## [16] {Type.Of.Traveller=Family Leisure,                                                                                
##       Seat.Type=Premium Economy}         => {Recommended=no}          0.011686992  0.7666667 0.015243902 1.092542    23
## [17] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[2.57,3.43]}          => {Seat.Type=Economy Class} 0.078252033  0.8461538 0.092479675 1.087675   154
## [18] {Type.Of.Traveller=Family Leisure,                                                                                
##       Seat.Type=Economy Class}           => {Recommended=no}          0.155995935  0.7599010 0.205284553 1.082900   307
## [19] {Seat.Type=Premium Economy,                                                                                       
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.022357724  0.7586207 0.029471545 1.081076    44
## [20] {Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.170731707  0.8358209 0.204268293 1.074393   336
# Top 20 high-confidence rules ranked by confidence
rule2 %>%
  sort(by = "confidence") %>%
  head(20) %>%
  inspect()
##      lhs                                    rhs                           support confidence    coverage     lift count
## [1]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=yes,                                                                                                 
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.014227642  0.9655172 0.014735772 1.241109    28
## [2]  {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Seat.Type=Premium Economy,                                                                                       
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.007621951  0.9375000 0.008130081 1.335988    15
## [3]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[2.57,3.43]}          => {Seat.Type=Economy Class} 0.057926829  0.8837209 0.065548780 1.135965   114
## [4]  {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.060467480  0.8686131 0.069613821 1.116545   119
## [5]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.056910569  0.8682171 0.065548780 1.116036   112
## [6]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.055386179  0.8650794 0.064024390 1.112003   109
## [7]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no}                    => {Seat.Type=Economy Class} 0.155995935  0.8647887 0.180386179 1.111629   307
## [8]  {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.071646341  0.8545455 0.083841463 1.098462   141
## [9]  {Type.Of.Traveller=Family Leisure}  => {Seat.Type=Economy Class} 0.205284553  0.8541226 0.240345528 1.097919   404
## [10] {Type.Of.Traveller=Family Leisure,                                                                                
##       Total Rating=[2.57,3.43]}          => {Seat.Type=Economy Class} 0.078252033  0.8461538 0.092479675 1.087675   154
## [11] {Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.170731707  0.8358209 0.204268293 1.074393   336
## [12] {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.041158537  0.8350515 0.049288618 1.073404    81
## [13] {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.085365854  0.8316832 0.102642276 1.069074   168
## [14] {Type.Of.Traveller=Couple Leisure,                                                                                
##       Recommended=no,                                                                                                  
##       Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.042174797  0.8300000 0.050813008 1.066911    83
## [15] {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Recommended=no}                    => {Seat.Type=Economy Class} 0.182926829  0.8275862 0.221036585 1.063808   360
## [16] {Type.Of.Traveller=Solo Leisure,                                                                                  
##       Recommended=no,                                                                                                  
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.057926829  0.8260870 0.070121951 1.061881   114
## [17] {Type.Of.Traveller=Couple Leisure,                                                                                
##       Seat.Type=First Class,                                                                                           
##       Total Rating=[2.57,3.43]}          => {Recommended=no}          0.007113821  0.8235294 0.008638211 1.173574    14
## [18] {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=yes}                   => {Seat.Type=Economy Class} 0.049288618  0.8220339 0.059959350 1.056671    97
## [19] {Total Rating=[1,1.86)}             => {Seat.Type=Economy Class} 0.236280488  0.8086957 0.292174797 1.039525   465
## [20] {Type.Of.Traveller=Family Leisure,                                                                                
##       Recommended=yes,                                                                                                 
##       Total Rating=[1.86,2.57)}          => {Seat.Type=Economy Class} 0.014735772  0.8055556 0.018292683 1.035489    29
# Scatter plot of the stricter rule2 set (support vs confidence, shaded by lift)
plot(rule2)

# Grouped matrix view of rule2: antecedent groups vs consequents
plot(rule2, method = "grouped")

#Naive Bayesian

# Load required libraries
library(e1071)
## Registered S3 methods overwritten by 'proxy':
##   method               from    
##   print.registry_field registry
##   print.registry_entry registry
library(caTools)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(dplyr)
library(gridExtra)
library(class)
# Load the cleaned reviews; stringsAsFactors = TRUE makes the categorical
# columns (including the outcome Recommended) factors up front.
reviews <- read.csv("/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", stringsAsFactors = TRUE)

# Select relevant features
features <- c("Type.Of.Traveler", "Seat.Type", "Seat.Comfort", "Cabin.Staff.Services", "Food.Beverages", "Inflight.Entertainment", "Ground.Service", "Wifi", "Value.For.Money")

# Build the model formula: Recommended ~ f1 + f2 + ...
# (named nb_formula so it does not shadow stats::formula)
nb_formula <- as.formula(paste("Recommended ~", paste(features, collapse = "+")))

# Split data into training and testing sets, stratified on the outcome.
set.seed(123)
trainIndex <- createDataPartition(reviews$Recommended, p = 0.8, list = FALSE)
trainData <- reviews[trainIndex, ]
testData <- reviews[-trainIndex, ]

# Make sure the outcome is a factor and that train/test share one level set
# BEFORE fitting and scoring (the original coerced testData$Recommended only
# after predicting, which risks a level mismatch in confusionMatrix).
trainData$Recommended <- as.factor(trainData$Recommended)
testData$Recommended <- factor(testData$Recommended, levels = levels(trainData$Recommended))

# Train Naive Bayes classifier
nb_model <- naiveBayes(nb_formula, data = trainData)

# Make predictions on test data
predictions <- predict(nb_model, testData[, features])

# Evaluate performance of model
conf_matrix <- confusionMatrix(predictions, testData$Recommended)
conf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  271 113
##        yes   4   1
##                                          
##                Accuracy : 0.6992         
##                  95% CI : (0.651, 0.7444)
##     No Information Rate : 0.7069         
##     P-Value [Acc > NIR] : 0.6541         
##                                          
##                   Kappa : -0.008         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.985455       
##             Specificity : 0.008772       
##          Pos Pred Value : 0.705729       
##          Neg Pred Value : 0.200000       
##              Prevalence : 0.706941       
##          Detection Rate : 0.696658       
##    Detection Prevalence : 0.987147       
##       Balanced Accuracy : 0.497113       
##                                          
##        'Positive' Class : no             
## 
# Render the Naive Bayes confusion matrix as a shaded tile plot with the raw
# counts overlaid on each cell.
conf_df <- as.data.frame(conf_matrix$table)

plot_conf_matrix <- ggplot(conf_df, aes(x = Reference, y = Prediction)) +
  geom_tile(aes(fill = Freq), color = "white") +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
  theme_minimal()

print(plot_conf_matrix)

# NOTE(review): this writes `reviews` back, unmodified, to the exact file it
# was read from at the top of this section — a data no-op. Confirm whether it
# is intentional (e.g. to refresh the file's formatting) before removing.
write.csv(reviews, "/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", row.names=FALSE)

#Decision Trees

# Re-read the cleaned reviews for the decision-tree models.
mydataset <- read.csv("/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", stringsAsFactors = TRUE)
# Truncate the averaged rating to an integer for the tree models.
mydataset$Total.Ratings <- as.integer(mydataset$Total.Ratings)
# Drop columns by POSITION (presumably free-text/route/date fields the trees
# should not use — the str() output below shows what survives). This assumes a
# stable column order in the CSV; TODO confirm, selecting by name is safer.
mydataset <- mydataset[,-c(3,12,13,15,16)]
str(mydataset)
## 'data.frame':    1952 obs. of  11 variables:
##  $ Type.Of.Traveler      : Factor w/ 4 levels "Business","Couple Leisure",..: 3 2 2 2 1 1 4 4 1 3 ...
##  $ Seat.Type             : Factor w/ 4 levels "Business Class",..: 2 2 3 2 3 2 2 2 2 4 ...
##  $ Recommended           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 2 1 1 ...
##  $ Seat.Comfort          : int  1 2 1 1 4 2 4 1 2 1 ...
##  $ Cabin.Staff.Services  : int  2 3 1 2 5 3 5 2 1 2 ...
##  $ Food.Beverages        : int  3 1 1 3 1 4 1 3 1 1 ...
##  $ Inflight.Entertainment: int  1 2 1 4 2 5 2 4 2 2 ...
##  $ Ground.Service        : int  2 3 1 1 3 1 3 5 3 3 ...
##  $ Wifi                  : int  3 1 1 2 4 2 4 1 1 4 ...
##  $ Value.For.Money       : int  1 2 1 3 1 3 5 1 1 5 ...
##  $ Verification          : Factor w/ 2 levels "Not Verified",..: 2 2 2 1 2 1 1 1 2 2 ...

##Splitting data into training and testing set

set.seed(123) # set a seed for reproducibility
library(caTools) # load the necessary library

options <- c("yes", "no")

# The original labels are heavily skewed toward "no", which collapses rpart to
# a single-node stump. Randomly reassign "yes"/"no" so the tree has balanced
# classes to split on; the resulting model is illustrative only.
mydataset$Recommended <- sample(options, size = nrow(mydataset), replace = TRUE)

# BUG FIX: sample.split() expects the OUTCOME VECTOR, not the whole data
# frame. Passing the data frame produces a logical vector of length ncol()
# that subset() then recycles across rows — a systematic pattern, not a
# random row split. Split on the label column instead.
split_data <- sample.split(mydataset$Recommended, SplitRatio = 0.9) # 90% training / 10% testing
train_data <- subset(mydataset, split_data == TRUE) # create the training set
test_data <- subset(mydataset, split_data == FALSE) # create the testing set
write.csv(train_data, "/Users/rahul.chauhan/Desktop/train_data.csv", row.names=FALSE)
write.csv(test_data, "/Users/rahul.chauhan/Desktop/test_data.csv", row.names=FALSE)

##First Model

library(rpart)

# First tree: predict Recommended from every remaining column.
model1 <- rpart(Recommended ~ ., data = train_data)

# A single-row frame means rpart found no worthwhile split (a stump).
if (nrow(model1$frame) == 1) {
  message("The decision tree has only one node.")
} else {
  plot(model1, margin = 0.1)
  text(model1, cex = 0.7)
}

# Class predictions on the hold-out set, summarised as a confusion matrix.
pr1 <- predict(model1, newdata = test_data, type = "class")
confusionMatrix(table(pred = pr1, true = test_data$Recommended))
## Confusion Matrix and Statistics
## 
##      true
## pred   no yes
##   no   47  49
##   yes 136 122
##                                           
##                Accuracy : 0.4774          
##                  95% CI : (0.4243, 0.5308)
##     No Information Rate : 0.5169          
##     P-Value [Acc > NIR] : 0.9384          
##                                           
##                   Kappa : -0.0292         
##                                           
##  Mcnemar's Test P-Value : 2.568e-10       
##                                           
##             Sensitivity : 0.2568          
##             Specificity : 0.7135          
##          Pos Pred Value : 0.4896          
##          Neg Pred Value : 0.4729          
##              Prevalence : 0.5169          
##          Detection Rate : 0.1328          
##    Detection Prevalence : 0.2712          
##       Balanced Accuracy : 0.4851          
##                                           
##        'Positive' Class : no              
## 
library(rpart.plot)
# Use the exported generic prune() rather than calling the S3 method
# prune.rpart() directly; dispatch reaches the same method, and the generic is
# the documented public interface. cp = 0 keeps every split (no pruning).
pruned_tree <- prune(model1, cp = 0)
prp(pruned_tree, faclen = 0, extra = 1, digits = 5)

# Nicer rendering of the unpruned fit, with node numbers shown.
rpart.plot(model1, box.palette = "RdGn", shadow.col = "gray", nn = TRUE)

##Second Model

# Second tree: restrict the predictors to the three categorical attributes.
model2 <- rpart(Recommended ~ Type.Of.Traveler + Seat.Type + Verification, data = train_data)

# Plot only when the tree actually split; otherwise report the stump.
if (nrow(model2$frame) == 1) {
  message("The decision tree has only one node.")
} else {
  plot(model2, margin = 0.1)
  text(model2, cex = 0.7)
}

# Hold-out predictions and confusion matrix for the reduced model.
pr2 <- predict(model2, newdata = test_data, type = "class")
confusionMatrix(table(pred = pr2, true = test_data$Recommended))
## Confusion Matrix and Statistics
## 
##      true
## pred   no yes
##   no   47  58
##   yes 136 113
##                                           
##                Accuracy : 0.452           
##                  95% CI : (0.3993, 0.5055)
##     No Information Rate : 0.5169          
##     P-Value [Acc > NIR] : 0.9938          
##                                           
##                   Kappa : -0.0811         
##                                           
##  Mcnemar's Test P-Value : 3.234e-08       
##                                           
##             Sensitivity : 0.2568          
##             Specificity : 0.6608          
##          Pos Pred Value : 0.4476          
##          Neg Pred Value : 0.4538          
##              Prevalence : 0.5169          
##          Detection Rate : 0.1328          
##    Detection Prevalence : 0.2966          
##       Balanced Accuracy : 0.4588          
##                                           
##        'Positive' Class : no              
## 
# Call the public generic prune() instead of the S3 method prune.rpart()
# directly — same result, documented interface. cp = 0 retains all splits.
pruned_tree <- prune(model2, cp = 0)
prp(pruned_tree, faclen = 0, extra = 1, digits = 5)

rpart.plot(model2, box.palette = "RdGn", shadow.col = "gray", nn = TRUE)

##Third Model

set.seed(1234)
options <- c("Verified", "Not Verified")
# Randomize the Verification labels (same rationale as for Recommended above):
# balanced classes give rpart something to split on; illustrative only.
mydataset$Verification <- sample(options, size = nrow(mydataset), replace = TRUE)

# BUG FIX: split on the outcome vector — sample.split() given the whole data
# frame returns a length-ncol mask that gets recycled across rows, which is
# not a random row split.
split_data <- sample.split(mydataset$Verification, SplitRatio = 0.9) # 90% training / 10% testing
train_data <- subset(mydataset, split_data == TRUE) # create the training set
test_data <- subset(mydataset, split_data == FALSE) # create the testing set

# Third tree: predict Verification from all other columns.
model3 <- rpart(Verification ~ ., data = train_data)
# check if the decision tree has more than one node
if (nrow(model3$frame) > 1) {
  plot(model3, margin = 0.1)
  text(model3, cex = 0.7)
} else {
  message("The decision tree has only one node.")
}

# Hold-out predictions and confusion matrix.
pr3 <- predict(model3, newdata = test_data, type = "class")
confusionMatrix(table(pred = pr3, true = test_data$Verification))
## Confusion Matrix and Statistics
## 
##               true
## pred           Not Verified Verified
##   Not Verified           98       97
##   Verified               72       88
##                                           
##                Accuracy : 0.5239          
##                  95% CI : (0.4706, 0.5769)
##     No Information Rate : 0.5211          
##     P-Value [Acc > NIR] : 0.47912         
##                                           
##                   Kappa : 0.0518          
##                                           
##  Mcnemar's Test P-Value : 0.06487         
##                                           
##             Sensitivity : 0.5765          
##             Specificity : 0.4757          
##          Pos Pred Value : 0.5026          
##          Neg Pred Value : 0.5500          
##              Prevalence : 0.4789          
##          Detection Rate : 0.2761          
##    Detection Prevalence : 0.5493          
##       Balanced Accuracy : 0.5261          
##                                           
##        'Positive' Class : Not Verified    
## 
# Use the generic prune() rather than the S3 method prune.rpart() directly;
# cp = 0 keeps the full tree.
pruned_tree <- prune(model3, cp = 0)
prp(pruned_tree, faclen = 0, extra = 1, digits = 5)

rpart.plot(model3, box.palette = "RdGn", shadow.col = "gray", nn = TRUE)

#SVM

library(e1071)
#SVMs require numeric, labeled data. Let's inspect the data frame for columns that are already numeric, and for columns that can be converted to numeric.
reviews <- read.csv("/Users/rahul.chauhan/Desktop/airline_reviews_cleaned.csv", stringsAsFactors = TRUE)

# Here, we can see that we have "Seat Comfort", "Cabin Staff Services", "Food & Beverages", "Inflight Entertainment", "Ground Service", "Wifi", "Value for money", "Total Ratings" which are labeled and numeric.

# We can also convert "Verification", "Type of Traveler", "Seat Type" and "Recommended" to numeric.
# "Origin", "Destination", "Reviews", "Date flown" cannot be used for SVMs.

# Work on a copy so the original `reviews` frame stays untouched.
reviews_svm <- as.data.frame(reviews)
# Drop unusable columns by POSITION (free-text/route/date fields per the notes
# above); assumes the CSV column order is stable — selecting by name is safer.
reviews_svm <- reviews_svm[ ,-c(3,12,14,15,16)]
sapply(reviews_svm, class)
##       Type.Of.Traveler              Seat.Type            Recommended 
##               "factor"               "factor"               "factor" 
##           Seat.Comfort   Cabin.Staff.Services         Food.Beverages 
##              "integer"              "integer"              "integer" 
## Inflight.Entertainment         Ground.Service                   Wifi 
##              "integer"              "integer"              "integer" 
##        Value.For.Money          Total.Ratings 
##              "integer"              "numeric"
# Encode the two remaining factor predictors as numeric level codes.
# CONSISTENCY FIX: read both from reviews_svm (the original mixed sources,
# taking Type.Of.Traveler from `reviews`) — values are identical since only
# columns were dropped, but one source avoids subtle bugs if they diverge.
reviews_svm$Type.Of.Traveler <- as.numeric(reviews_svm$Type.Of.Traveler)
reviews_svm$Seat.Type <- as.numeric(reviews_svm$Seat.Type)
sapply(reviews_svm, class)
##       Type.Of.Traveler              Seat.Type            Recommended 
##              "numeric"              "numeric"               "factor" 
##           Seat.Comfort   Cabin.Staff.Services         Food.Beverages 
##              "integer"              "integer"              "integer" 
## Inflight.Entertainment         Ground.Service                   Wifi 
##              "integer"              "integer"              "integer" 
##        Value.For.Money          Total.Ratings 
##              "integer"              "numeric"
str(reviews_svm)
## 'data.frame':    1952 obs. of  11 variables:
##  $ Type.Of.Traveler      : num  3 2 2 2 1 1 4 4 1 3 ...
##  $ Seat.Type             : num  2 2 3 2 3 2 2 2 2 4 ...
##  $ Recommended           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 2 1 1 ...
##  $ Seat.Comfort          : int  1 2 1 1 4 2 4 1 2 1 ...
##  $ Cabin.Staff.Services  : int  2 3 1 2 5 3 5 2 1 2 ...
##  $ Food.Beverages        : int  3 1 1 3 1 4 1 3 1 1 ...
##  $ Inflight.Entertainment: int  1 2 1 4 2 5 2 4 2 2 ...
##  $ Ground.Service        : int  2 3 1 1 3 1 3 5 3 3 ...
##  $ Wifi                  : int  3 1 1 2 4 2 4 1 1 4 ...
##  $ Value.For.Money       : int  1 2 1 3 1 3 5 1 1 5 ...
##  $ Total.Ratings         : num  1.86 2 1 2.29 2.86 ...
head(reviews_svm)
##   Type.Of.Traveler Seat.Type Recommended Seat.Comfort Cabin.Staff.Services
## 1                3         2          no            1                    2
## 2                2         2         yes            2                    3
## 3                2         3          no            1                    1
## 4                2         2          no            1                    2
## 5                1         3          no            4                    5
## 6                1         2          no            2                    3
##   Food.Beverages Inflight.Entertainment Ground.Service Wifi Value.For.Money
## 1              3                      1              2    3               1
## 2              1                      2              3    1               2
## 3              1                      1              1    1               1
## 4              3                      4              1    2               3
## 5              1                      2              3    4               1
## 6              4                      5              1    2               3
##   Total.Ratings
## 1      1.857143
## 2      2.000000
## 3      1.000000
## 4      2.285714
## 5      2.857143
## 6      2.857143

##Linear Kernel with traditional method

library(e1071)
set.seed(123)
# BUG FIX: sample.split() expects the LABEL VECTOR; passing the data frame
# yields a length-ncol logical that subset() recycles across rows, which is
# not a random row split. Split on the outcome column instead.
split_data <- sample.split(reviews_svm$Recommended, SplitRatio = 0.9) # 90% training / 10% testing
train <- subset(reviews_svm, split_data == TRUE) # create the training set
test <- subset(reviews_svm, split_data == FALSE) # create the testing set

# Linear-kernel SVM on all predictors.
svm_linear <- svm(Recommended ~ ., data = train, kernel = "linear")

svm_linear_pred <- predict(svm_linear, test)

svm_linear_cm <- confusionMatrix(svm_linear_pred, test$Recommended)

svm_linear_accuracy <- svm_linear_cm$overall["Accuracy"]
print("Confusion matrix for SVM with linear kernel:")
## [1] "Confusion matrix for SVM with linear kernel:"
print(svm_linear_cm$table)
##           Reference
## Prediction  no yes
##        no  240 115
##        yes   0   0
print(paste("Accuracy:", svm_linear_accuracy))
## [1] "Accuracy: 0.676056338028169"
# Project the fitted decision regions onto two of the features.
plot(svm_linear, reviews_svm, Total.Ratings ~ Type.Of.Traveler)

##Polynomial Kernel with traditional method

# Degree-3 polynomial kernel, reusing the train/test split from above.
svm_poly <- svm(Recommended ~ ., data = train, kernel = "polynomial", degree = 3)

# Score the hold-out set and summarise performance.
svm_poly_pred <- predict(svm_poly, newdata = test)
svm_poly_cm <- confusionMatrix(svm_poly_pred, test$Recommended)
svm_poly_accuracy <- svm_poly_cm$overall["Accuracy"]

print("Confusion matrix for SVM with polynomial kernel:")
## [1] "Confusion matrix for SVM with polynomial kernel:"
print(svm_poly_cm$table)
##           Reference
## Prediction  no yes
##        no  240 115
##        yes   0   0
print(paste("Accuracy:", svm_poly_accuracy))
## [1] "Accuracy: 0.676056338028169"
# Decision-region view over two features.
plot(svm_poly, reviews_svm, Total.Ratings ~ Type.Of.Traveler)

##Radial Kernel with traditional method

# Radial-basis (RBF) kernel on the same split.
svm_radial <- svm(Recommended ~ ., data = train, kernel = "radial")

# Score the hold-out set and summarise performance.
svm_radial_pred <- predict(svm_radial, newdata = test)
svm_radial_cm <- confusionMatrix(svm_radial_pred, test$Recommended)
svm_radial_accuracy <- svm_radial_cm$overall["Accuracy"]

print("Confusion matrix for SVM with radial kernel:")
## [1] "Confusion matrix for SVM with radial kernel:"
print(svm_radial_cm$table)
##           Reference
## Prediction  no yes
##        no  240 115
##        yes   0   0
print(paste("Accuracy:", svm_radial_accuracy))
## [1] "Accuracy: 0.676056338028169"
# Decision-region view over two features.
plot(svm_radial, reviews_svm, Total.Ratings ~ Type.Of.Traveler)

#All the kernel using three different costs for each

library(e1071)
set.seed(42)
# 90/10 row split via a random group indicator (1 = train, 2 = test).
ind <- sample(2, nrow(reviews_svm), replace = TRUE, prob = c(0.9, 0.1))
train_data <- reviews_svm[ind == 1,]
test_data <- reviews_svm[ind == 2,]

write.csv(train_data, "/Users/rahul.chauhan/Desktop/train_data_svm.csv", row.names=FALSE)
write.csv(test_data, "/Users/rahul.chauhan/Desktop/test_data_svm.csv", row.names=FALSE)

kernels <- c("linear", "polynomial", "radial")
costs <- c(0.1, 1, 10)

# Fit one SVM on the global train_data with the given kernel and cost,
# plot it, and print hold-out accuracy plus the confusion matrix.
# IMPROVEMENT: the results are now also returned invisibly (the original
# returned cat()'s NULL), so callers can collect them programmatically
# without changing the printed output.
fit_svm <- function(kernel, cost) {
  model <- svm(Recommended ~ ., data = train_data, kernel = kernel, cost = cost)
  preds <- predict(model, test_data)
  # mean of the logical comparison == proportion correct
  accuracy <- mean(preds == test_data$Recommended)
  cm <- table(Predicted = preds, Actual = test_data$Recommended)

  plot(model, train_data, Seat.Type ~ Seat.Comfort, main = paste("Kernel:", kernel, ", Cost:", cost))
  cat("Kernel:", kernel, ", Cost:", cost, "\n")
  cat("Accuracy:", accuracy, "\n")
  cat("Confusion Matrix:\n")
  print(cm)
  cat("\n")
  invisible(list(model = model, accuracy = accuracy, confusion = cm))
}

# Evaluate every kernel/cost combination.
for (kernel in kernels) {
  for (cost in costs) {
    fit_svm(kernel, cost)
  }
}

## Kernel: linear , Cost: 0.1 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: linear , Cost: 1 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: linear , Cost: 10 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: polynomial , Cost: 0.1 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: polynomial , Cost: 1 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: polynomial , Cost: 10 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: radial , Cost: 0.1 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: radial , Cost: 1 
## Accuracy: 0.7156398 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  151  60
##       yes   0   0

## Kernel: radial , Cost: 10 
## Accuracy: 0.7061611 
## Confusion Matrix:
##          Actual
## Predicted  no yes
##       no  147  58
##       yes   4   2